import re
import html
from pathlib import Path

import pandas as pd
from unidecode import unidecode


# Match LaTeX/natbib citation commands and capture the comma-separated key list:
# \cite{...}, \citep{...}, \citet[...]{...}, \citep[...][...]{...}, and the
# starred forms such as \citet*{...} or \citeauthor*{...}.
CITE_RE = re.compile(r"\\cite[a-zA-Z]*?\*?(?:\[[^\]]*\])*\{([^}]+)\}")
# Capture a DOI of the form "10.<4-9 digits>/<suffix>" anywhere in a string,
# case-insensitively.
DOI_RE = re.compile(r"(10\.[0-9]{4,9}/[-._;()/:A-Z0-9]+)", re.IGNORECASE)


def extract_citekeys(tex: str) -> set[str]:
    """Return the set of citation keys referenced in a LaTeX source string.

    Every match of ``CITE_RE`` contributes its brace argument, which is split
    on commas; surrounding whitespace is trimmed and empty pieces are dropped.
    """
    return {
        key
        for match in CITE_RE.finditer(tex)
        for key in (piece.strip() for piece in match.group(1).split(","))
        if key
    }


# Per-character LaTeX escapes applied by bib_escape(). A translation table is
# used so every character is rewritten exactly once: the braces emitted for
# "\textbackslash{}" and "\^{}" are therefore never re-escaped.
_BIB_ESCAPE_TABLE = str.maketrans(
    {
        "\\": r"\textbackslash{}",
        "&": r"\&",
        "%": r"\%",
        "_": r"\_",
        "{": r"\{",
        "}": r"\}",
        "#": r"\#",
        "$": r"\$",
        "^": r"\^{}",
    }
)


def bib_escape(s: str) -> str:
    """Escape LaTeX special characters so *s* can be placed in a BibTeX field.

    The input is treated as plain text (not LaTeX markup): ``& % _ { } # $``
    are backslash-escaped, ``^`` becomes ``\\^{}``, and a literal backslash
    becomes ``\\textbackslash{}`` (a doubled backslash would be a LaTeX line
    break rather than a printed backslash).

    Args:
        s: Value to escape; non-string values are converted with ``str()``.

    Returns:
        The escaped string.
    """
    return str(s).translate(_BIB_ESCAPE_TABLE)


def authors_to_bib(authors: str) -> str:
    """Convert a ``"A; B; C"`` author list into BibTeX's ``"A and B and C"``.

    Args:
        authors: Semicolon-separated author names. ``None`` is treated as
            "no authors" instead of being stringified to ``"None"``.

    Returns:
        The names joined with ``" and "``; whitespace around each name is
        trimmed and empty segments are dropped. Empty string if there are
        no names.
    """
    if authors is None:
        return ""
    names = (name.strip() for name in str(authors).split(";"))
    return " and ".join(name for name in names if name)

def _maybe_fix_mojibake(s: str) -> str:
    """Heuristically repair UTF-8 text that was mis-decoded as a single-byte codec.

    Typical inputs look like "ZÃºÃ±iga" or "donâ€™t". The string is only
    touched when it contains one of the tell-tale marker sequences; it is then
    re-encoded with the suspected wrong codec and decoded as UTF-8.

    Both Windows-1252 and Latin-1 are tried: sequences such as "â€™" contain
    characters ("€", "™") that exist only in Windows-1252, while some other
    sequences contain C1 control characters that only Latin-1 can encode.

    Args:
        s: Text to inspect.

    Returns:
        The repaired text, or *s* unchanged when no marker is present or no
        round-trip yields valid UTF-8.
    """
    # "â€" also covers the longer quote/dash markers ("â€™", "â€“", "â€”").
    if not any(marker in s for marker in ("Ã", "Â", "â€")):
        return s
    for codec in ("cp1252", "latin1"):
        try:
            return s.encode(codec).decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not mojibake under this codec; try the next one.
            continue
    return s


def normalize_text(value) -> str:
    """Turn an arbitrary cell value into clean, ASCII-only, single-spaced text.

    ``None`` and pandas-missing values yield ``""``. Otherwise the value is
    stringified, HTML entities are unescaped, mojibake is repaired where
    detected, typographic ligatures/quotes/dashes are replaced by ASCII
    equivalents, whitespace runs are collapsed, and any remaining non-ASCII
    characters are transliterated with ``unidecode``.
    """
    if value is None:
        return ""
    try:
        missing = bool(pd.isna(value))
    except Exception:
        # pd.isna may raise or give an ambiguous (array) answer for unusual
        # values; treat those as "not missing" and keep going.
        missing = False
    if missing:
        return ""

    text = _maybe_fix_mojibake(html.unescape(str(value)))

    # Replace common PDF ligatures/typography to keep pdfLaTeX happy.
    typography = str.maketrans(
        {
            "\ufb00": "ff",
            "\ufb01": "fi",
            "\ufb02": "fl",
            "\ufb03": "ffi",
            "\ufb04": "ffl",
            "\u2019": "'",
            "\u2018": "'",
            "\u201c": '"',
            "\u201d": '"',
            "\u2013": "-",
            "\u2014": "--",
        }
    )
    text = text.translate(typography)

    # Collapse every run of whitespace into a single space.
    text = " ".join(text.split())

    # Transliterate any remaining non-ASCII (e.g., Cyrillic) for compilation stability.
    if not text.isascii():
        text = unidecode(text)
    return text.strip()


def extract_doi(value: str) -> str:
    """Return the first DOI found in *value*, lower-cased, or ``""`` if none.

    Falsy inputs (``None``, ``""``) are treated as empty text; anything else
    is stringified before ``DOI_RE`` is searched.
    """
    found = DOI_RE.search(str(value or ""))
    if found is None:
        return ""
    return found.group(1).lower().strip()

def pick(*values) -> str:
    """Return the first usable value as a stripped string, or ``""``.

    A value is skipped when it is ``None``, is reported missing by
    ``pd.isna``, is blank after stripping, or spells ``"nan"`` in any case.
    """
    for candidate in values:
        if candidate is None:
            continue
        try:
            missing = bool(pd.isna(candidate))
        except Exception:
            # pd.isna may raise or give an ambiguous (array) answer; in that
            # case fall through and judge the value by its string form.
            missing = False
        if missing:
            continue
        text = str(candidate).strip()
        if text and text.lower() != "nan":
            return text
    return ""

def clean_num_field(s: str) -> str:
    """Strip a spurious ``.0`` suffix from an integer that was read as a float.

    The input is stringified (falsy values become ``""``) and trimmed. Only
    text that is exactly digits followed by ``.0`` is shortened to the digits;
    everything else is returned as trimmed.
    """
    text = str(s or "").strip()
    is_whole_float = re.fullmatch(r"\d+\.0", text) is not None
    return text.partition(".")[0] if is_whole_float else text


def make_entry(row: pd.Series) -> str:
    """Render one writing-card row as a BibTeX entry.

    Crossref-derived columns (``crossref_title``, ``crossref_authors``,
    ``crossref_year``, ``crossref_container_title``) take precedence over the
    card's own ``title`` / ``authors`` / ``year`` / ``venue`` columns. The
    entry type is derived from ``crossref_type`` and defaults to ``misc``.

    Args:
        row: A row of the cards table; read via ``row.get(column)``.

    Returns:
        The entry text ``@type{citekey, ...}`` terminated by a newline. Only
        fields with a non-empty value are emitted.
    """
    citekey = str(row.get("citekey", "")).strip()
    title = normalize_text(pick(row.get("crossref_title"), row.get("title")))

    # Use the first candidate that parses to a non-zero integer year. Going
    # through pick() matters: a missing CSV cell is NaN, which is truthy, so a
    # plain `a or b` would never fall back to the second column.
    year = ""
    for raw_year in (row.get("crossref_year"), row.get("year")):
        try:
            parsed_year = int(float(pick(raw_year)))
        except (TypeError, ValueError, OverflowError):
            continue
        if parsed_year:
            year = parsed_year
            break

    authors = normalize_text(
        authors_to_bib(pick(row.get("crossref_authors"), row.get("authors")))
    )
    container = normalize_text(
        pick(row.get("crossref_container_title"), row.get("venue"))
    )
    # First source that actually contains a DOI wins; pick() maps missing
    # values to "" so they simply do not match.
    doi = (
        extract_doi(pick(row.get("doi")))
        or extract_doi(pick(row.get("crossref_url")))
        or extract_doi(pick(row.get("url")))
    )
    volume = clean_num_field(normalize_text(pick(row.get("crossref_volume"))))
    issue = clean_num_field(normalize_text(pick(row.get("crossref_issue"))))
    pages = normalize_text(pick(row.get("crossref_pages")))

    # Map the Crossref work type to a BibTeX entry type.
    bib_type = {
        "journal-article": "article",
        "article": "article",
        "book": "book",
        "monograph": "book",
        "report": "techreport",
        "report-series": "techreport",
        "book-chapter": "incollection",
    }.get(pick(row.get("crossref_type")).lower(), "misc")

    fields = []
    if authors:
        fields.append(f"  author = {{{bib_escape(authors)}}}")
    if title:
        fields.append(f"  title = {{{bib_escape(title)}}}")
    if year:
        fields.append(f"  year = {{{year}}}")
    if container and bib_type == "article":
        fields.append(f"  journal = {{{bib_escape(container)}}}")
    if container and bib_type == "incollection":
        fields.append(f"  booktitle = {{{bib_escape(container)}}}")
    if bib_type in {"book", "techreport"}:
        # The Crossref publisher doubles as the institution for reports.
        publisher = normalize_text(pick(row.get("crossref_publisher")))
        if publisher:
            field_name = "publisher" if bib_type == "book" else "institution"
            fields.append(f"  {field_name} = {{{bib_escape(publisher)}}}")
    if volume:
        fields.append(f"  volume = {{{bib_escape(volume)}}}")
    if issue:
        fields.append(f"  number = {{{bib_escape(issue)}}}")
    if pages:
        fields.append(f"  pages = {{{bib_escape(pages)}}}")
    if doi:
        # NOTE(review): doi/url values are LaTeX-escaped like text fields
        # (e.g. "_" -> "\_"); confirm the bibliography style in use renders
        # escaped URLs as intended.
        fields.append(f"  doi = {{{bib_escape(doi)}}}")
        fields.append(f"  url = {{https://doi.org/{bib_escape(doi)}}}")

    body = ",\n".join(fields)
    return f"@{bib_type}{{{citekey},\n{body}\n}}\n"


def main() -> None:
    """Regenerate ``paper_joc/references.bib`` from the cited writing cards.

    Steps: pick the highest-versioned ``lit_writing_cards_v*.csv`` under
    ``paper_joc/literature`` (falling back to the ``v1`` path), collect the
    citation keys used in ``main.tex`` and ``sections/*.tex``, build one
    BibTeX entry per key from the first card row with that key, and write the
    result.

    Raises:
        SystemExit: If any cited key has no row in the cards file.
    """
    # The project root is the parent of the directory holding this script.
    paper = Path(__file__).resolve().parents[1] / "paper_joc"
    cards_dir = paper / "literature"
    out_bib = paper / "references.bib"

    def version_of(path: Path) -> int:
        # Files whose name carries no numeric version sort before all others.
        found = re.search(r"_v(\d+)\.csv$", path.name)
        return int(found.group(1)) if found else -1

    versioned = sorted(cards_dir.glob("lit_writing_cards_v*.csv"))
    if versioned:
        # Stable sort on top of the name ordering; the last item wins.
        versioned.sort(key=version_of)
        cards_path = versioned[-1]
    else:
        cards_path = cards_dir / "lit_writing_cards_v1.csv"

    # Collect keys from main.tex and every section file that exists.
    tex_files = [paper / "main.tex"]
    tex_files.extend(sorted((paper / "sections").glob("*.tex")))
    cited: set[str] = set()
    for tex_file in tex_files:
        if not tex_file.exists():
            continue
        cited.update(extract_citekeys(tex_file.read_text(encoding="utf-8")))

    cards = pd.read_csv(cards_path)
    cards["citekey"] = cards["citekey"].astype(str)
    # When a key occurs on several rows, only its first row is used.
    first_row_by_key = {
        key: group.iloc[0] for key, group in cards.groupby("citekey", sort=False)
    }

    entries = []
    missing = []
    for key in sorted(cited):
        card = first_row_by_key.get(key)
        if card is None:
            missing.append(key)
        else:
            entries.append(make_entry(card))

    if missing:
        raise SystemExit(f"Missing citekeys in writing cards ({cards_path}): {missing}")

    out_bib.write_text("\n".join(entries), encoding="utf-8")
    print(f"Wrote: {out_bib} (entries={len(entries)})")


# Run the bibliography generation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
